Install necessary libraries

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(knitr)
library(formattable)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(leaps)
library(splines)
library(caret)
## Loading required package: lattice
library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 4.1-8
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ lubridate 1.9.3     ✔ stringr   1.5.1
## ✔ purrr     1.0.2     ✔ tibble    3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ randomForest::combine() masks dplyr::combine()
## ✖ Matrix::expand()        masks tidyr::expand()
## ✖ dplyr::filter()         masks stats::filter()
## ✖ dplyr::lag()            masks stats::lag()
## ✖ purrr::lift()           masks caret::lift()
## ✖ randomForest::margin()  masks ggplot2::margin()
## ✖ Matrix::pack()          masks tidyr::pack()
## ✖ Matrix::unpack()        masks tidyr::unpack()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(cluster)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

Loading & Checking Data

taxi_df <- read.csv("../taxi_Trips__2024.csv")
head(taxi_df)

{r}tlsg print(names(taxi_df)) print(nrow(taxi_df))

colSums(is.na(taxi_df))
##                    Trip.ID                    Taxi.ID 
##                          0                          1 
##       Trip.Start.Timestamp         Trip.End.Timestamp 
##                          0                          0 
##               Trip.Seconds                 Trip.Miles 
##                         87                          5 
##        Pickup.Census.Tract       Dropoff.Census.Tract 
##                     266886                     273775 
##      Pickup.Community.Area     Dropoff.Community.Area 
##                      11105                      42428 
##                       Fare                       Tips 
##                       1014                       1014 
##                      Tolls                     Extras 
##                       1014                       1014 
##                 Trip.Total               Payment.Type 
##                       1014                          0 
##                    Company   Pickup.Centroid.Latitude 
##                          0                      10989 
##  Pickup.Centroid.Longitude   Pickup.Centroid.Location 
##                      10989                          0 
##  Dropoff.Centroid.Latitude Dropoff.Centroid.Longitude 
##                      39982                      39982 
## Dropoff.Centroid..Location 
##                          0
(unique(taxi_df$Company))
##  [1] "Flash Cab"                           
##  [2] "Taxicab Insurance Agency Llc"        
##  [3] "Globe Taxi"                          
##  [4] "5 Star Taxi"                         
##  [5] "City Service"                        
##  [6] "Chicago Independents"                
##  [7] "Blue Ribbon Taxi Association"        
##  [8] "Taxi Affiliation Services"           
##  [9] "Chicago City Taxi Association"       
## [10] "Choice Taxi Association"             
## [11] "Medallion Leasin"                    
## [12] "Sun Taxi"                            
## [13] "U Taxicab"                           
## [14] "Taxicab Insurance Agency, LLC"       
## [15] "Choice Taxi Association Inc"         
## [16] "Chicago Taxicab"                     
## [17] "Patriot Taxi Dba Peace Taxi Associat"
## [18] "Setare Inc"                          
## [19] "Taxi Affiliation Services Llc - Yell"
## [20] "3556 - 36214 RC Andrews Cab"         
## [21] "Top Cab"                             
## [22] "Koam Taxi Association"               
## [23] "312 Medallion Management Corp"       
## [24] "Star North Taxi Management Llc"      
## [25] "6574 - Babylon Express Inc."         
## [26] "5167 - 71969 5167 Taxi Inc"          
## [27] "2733 - 74600 Benny Jona"             
## [28] "3591 - 63480 Chuks Cab"              
## [29] "Tac - Yellow Cab Association"        
## [30] "Metro Jet Taxi A."                   
## [31] "4787 - 56058 Reny Cab Co"            
## [32] "4623 - 27290 Jay Kim"                
## [33] "4053 - 40193 Adwar H. Nikola"        
## [34] "Petani Cab Corp"                     
## [35] "Tac - Checker Cab Dispatch"

Understanding the features

feature_desc <- read.csv("taxi_Trips_2024_Feature_descriptions.csv")
feature_desc
notes <- list()
for (feature in names(taxi_df)) {
  
  curr_note <- paste("Valid rows:",(nrow(taxi_df) - sum(is.na(taxi_df[,feature]))),
                     "; N/A rows:", sum(is.na(taxi_df[,feature])),
                     "; Unique values:", length(unique(taxi_df[,feature])))
  notes <- append(notes, curr_note)
}
notes
## [[1]]
## [1] "Valid rows: 425229 ; N/A rows: 0 ; Unique values: 425229"
## 
## [[2]]
## [1] "Valid rows: 425228 ; N/A rows: 1 ; Unique values: 2520"
## 
## [[3]]
## [1] "Valid rows: 425229 ; N/A rows: 0 ; Unique values: 2977"
## 
## [[4]]
## [1] "Valid rows: 425229 ; N/A rows: 0 ; Unique values: 2988"
## 
## [[5]]
## [1] "Valid rows: 425142 ; N/A rows: 87 ; Unique values: 6077"
## 
## [[6]]
## [1] "Valid rows: 425224 ; N/A rows: 5 ; Unique values: 4278"
## 
## [[7]]
## [1] "Valid rows: 158343 ; N/A rows: 266886 ; Unique values: 221"
## 
## [[8]]
## [1] "Valid rows: 151454 ; N/A rows: 273775 ; Unique values: 347"
## 
## [[9]]
## [1] "Valid rows: 414124 ; N/A rows: 11105 ; Unique values: 78"
## 
## [[10]]
## [1] "Valid rows: 382801 ; N/A rows: 42428 ; Unique values: 78"
## 
## [[11]]
## [1] "Valid rows: 424215 ; N/A rows: 1014 ; Unique values: 4642"
## 
## [[12]]
## [1] "Valid rows: 424215 ; N/A rows: 1014 ; Unique values: 2161"
## 
## [[13]]
## [1] "Valid rows: 424215 ; N/A rows: 1014 ; Unique values: 139"
## 
## [[14]]
## [1] "Valid rows: 424215 ; N/A rows: 1014 ; Unique values: 865"
## 
## [[15]]
## [1] "Valid rows: 424215 ; N/A rows: 1014 ; Unique values: 7816"
## 
## [[16]]
## [1] "Valid rows: 425229 ; N/A rows: 0 ; Unique values: 7"
## 
## [[17]]
## [1] "Valid rows: 425229 ; N/A rows: 0 ; Unique values: 35"
## 
## [[18]]
## [1] "Valid rows: 414240 ; N/A rows: 10989 ; Unique values: 276"
## 
## [[19]]
## [1] "Valid rows: 414240 ; N/A rows: 10989 ; Unique values: 276"
## 
## [[20]]
## [1] "Valid rows: 425229 ; N/A rows: 0 ; Unique values: 277"
## 
## [[21]]
## [1] "Valid rows: 385247 ; N/A rows: 39982 ; Unique values: 366"
## 
## [[22]]
## [1] "Valid rows: 385247 ; N/A rows: 39982 ; Unique values: 365"
## 
## [[23]]
## [1] "Valid rows: 425229 ; N/A rows: 0 ; Unique values: 366"
feature_desc$Notes <- unlist(notes)
feature_desc
# Install the formattable package if not already installed
if (!require(formattable)) {
  install.packages("formattable")
}

formattable(feature_desc, align = c("l", "l", "l", "l"), list(Notes = formatter("span", style = "color:blue"), Type = formatter("span", style = "color:green"))
)
Column.Name Description Type Notes
Trip ID A unique identifier for the trip. Plain Text Valid rows: 425229 ; N/A rows: 0 ; Unique values: 425229
Taxi ID A unique identifier for the taxi. Plain Text Valid rows: 425228 ; N/A rows: 1 ; Unique values: 2520
Trip Start Timestamp When the trip started, rounded to the nearest 15 minutes. Date & Time Valid rows: 425229 ; N/A rows: 0 ; Unique values: 2977
Trip End Timestamp When the trip ended, rounded to the nearest 15 minutes. Date & Time Valid rows: 425229 ; N/A rows: 0 ; Unique values: 2988
Trip Seconds Time of the trip in seconds. Number Valid rows: 425142 ; N/A rows: 87 ; Unique values: 6077
Trip Miles Distance of the trip in miles. Number Valid rows: 425224 ; N/A rows: 5 ; Unique values: 4278
Pickup Census Tract The Census Tract where the trip began. For privacy, this Census Tract is not shown for some trips. This column often will be blank for locations outside Chicago. Plain Text Valid rows: 158343 ; N/A rows: 266886 ; Unique values: 221
Dropoff Census Tract The Census Tract where the trip ended. For privacy, this Census Tract is not shown for some trips. This column often will be blank for locations outside Chicago. Plain Text Valid rows: 151454 ; N/A rows: 273775 ; Unique values: 347
Pickup Community Area The Community Area where the trip began. This column will be blank for locations outside Chicago. Number Valid rows: 414124 ; N/A rows: 11105 ; Unique values: 78
Dropoff Community Area The Community Area where the trip ended. This column will be blank for locations outside Chicago. Number Valid rows: 382801 ; N/A rows: 42428 ; Unique values: 78
Fare The fare for the trip. Number Valid rows: 424215 ; N/A rows: 1014 ; Unique values: 4642
Tips The tip for the trip. Cash tips generally will not be recorded. Number Valid rows: 424215 ; N/A rows: 1014 ; Unique values: 2161
Tolls The tolls for the trip. Number Valid rows: 424215 ; N/A rows: 1014 ; Unique values: 139
Extras Extra charges for the trip. Number Valid rows: 424215 ; N/A rows: 1014 ; Unique values: 865
Trip Total Total cost of the trip, the total of the previous columns. Number Valid rows: 424215 ; N/A rows: 1014 ; Unique values: 7816
Payment Type Type of payment for the trip. Plain Text Valid rows: 425229 ; N/A rows: 0 ; Unique values: 7
Company The taxi company. Plain Text Valid rows: 425229 ; N/A rows: 0 ; Unique values: 35
Pickup Centroid Latitude The latitude of the center of the pickup census tract or the community area if the census tract has been hidden for privacy. This column often will be blank for locations outside Chicago. Number Valid rows: 414240 ; N/A rows: 10989 ; Unique values: 276
Pickup Centroid Longitude The longitude of the center of the pickup census tract or the community area if the census tract has been hidden for privacy. This column often will be blank for locations outside Chicago. Number Valid rows: 414240 ; N/A rows: 10989 ; Unique values: 276
Pickup Centroid Location The location of the center of the pickup census tract or the community area if the census tract has been hidden for privacy. This column often will be blank for locations outside Chicago. Point Valid rows: 425229 ; N/A rows: 0 ; Unique values: 277
Dropoff Centroid Latitude The latitude of the center of the dropoff census tract or the community area if the census tract has been hidden for privacy. This column often will be blank for locations outside Chicago. Number Valid rows: 385247 ; N/A rows: 39982 ; Unique values: 366
Dropoff Centroid Longitude The longitude of the center of the dropoff census tract or the community area if the census tract has been hidden for privacy. This column often will be blank for locations outside Chicago. Number Valid rows: 385247 ; N/A rows: 39982 ; Unique values: 365
Dropoff Centroid Location The location of the center of the dropoff census tract or the community area if the census tract has been hidden for privacy. This column often will be blank for locations outside Chicago. Point Valid rows: 425229 ; N/A rows: 0 ; Unique values: 366
length(which(taxi_df$Fare + taxi_df$Tips + taxi_df$Tolls + taxi_df$Extras != taxi_df$Trip.Total))
## [1] 136222

1. Data Cleaning

1.1 Dropping unwanted features

features_to_drop <- c("Trip.End.Timestamp", "Pickup.Census.Tract", "Dropoff.Census.Tract", "Pickup.Centroid.Latitude", "Pickup.Centroid.Longitude", "Pickup.Centroid.Location", "Dropoff.Centroid.Latitude", "Dropoff.Centroid.Longitude", "Dropoff.Centroid..Location")

simplified_taxi_df <- subset(taxi_df, select = -c(Trip.End.Timestamp, Pickup.Census.Tract, Dropoff.Census.Tract, Pickup.Centroid.Latitude, Pickup.Centroid.Longitude, Pickup.Centroid.Location, Dropoff.Centroid.Latitude, Dropoff.Centroid.Longitude, Dropoff.Centroid..Location))
head(simplified_taxi_df)

1.2 Removing Null rows

dim(simplified_taxi_df)
## [1] 425229     14
colSums(is.na(simplified_taxi_df))
##                Trip.ID                Taxi.ID   Trip.Start.Timestamp 
##                      0                      1                      0 
##           Trip.Seconds             Trip.Miles  Pickup.Community.Area 
##                     87                      5                  11105 
## Dropoff.Community.Area                   Fare                   Tips 
##                  42428                   1014                   1014 
##                  Tolls                 Extras             Trip.Total 
##                   1014                   1014                   1014 
##           Payment.Type                Company 
##                      0                      0
cleaned_taxi_df <- simplified_taxi_df[!apply(is.na(simplified_taxi_df), 1, any), ]
colSums(is.na(cleaned_taxi_df))
##                Trip.ID                Taxi.ID   Trip.Start.Timestamp 
##                      0                      0                      0 
##           Trip.Seconds             Trip.Miles  Pickup.Community.Area 
##                      0                      0                      0 
## Dropoff.Community.Area                   Fare                   Tips 
##                      0                      0                      0 
##                  Tolls                 Extras             Trip.Total 
##                      0                      0                      0 
##           Payment.Type                Company 
##                      0                      0
dim(cleaned_taxi_df)
## [1] 379024     14
head(cleaned_taxi_df)

1.3 Converting seconds to mins

cleaned_taxi_df$Trip.Minutes <- round(cleaned_taxi_df$Trip.Seconds / 60, digits = 2) 
cleaned_taxi_df$Trip.Seconds <- NULL 
head(cleaned_taxi_df)

1.4 Converting timestamp to hour of the day & day of the week

#cleaned_taxi_df$Trip.Start.Timestamp <- as.POSIXct(cleaned_taxi_df$Trip.Start.Timestamp, format = "%m/%d/%Y %I:%M:%S %p")

# ---------------------------------------------------------------------------------
# If you are facing any errors with the format used above, try using the one below 
# ---------------------------------------------------------------------------------
cleaned_taxi_df$Trip.Start.Timestamp <- as.POSIXct(cleaned_taxi_df$Trip.Start.Timestamp, format = "%m/%d/%y %H:%M")


cleaned_taxi_df$Trip.Start.Date <- as.Date(cleaned_taxi_df$Trip.Start.Timestamp)

cleaned_taxi_df$Trip.Hour.Of.The.Day <- as.integer(format(cleaned_taxi_df$Trip.Start.Timestamp, format = "%H"))
cleaned_taxi_df$Trip.Hour.Of.The.Day <- as.factor(cleaned_taxi_df$Trip.Hour.Of.The.Day)

days_of_week <- c("Sunday" = 1, "Monday" = 2, "Tuesday" = 3, "Wednesday" = 4, "Thursday" = 5, "Friday" = 6, "Saturday" = 7)
cleaned_taxi_df$Trip.Day.Of.The.Week <- as.integer(days_of_week[weekdays(cleaned_taxi_df$Trip.Start.Timestamp)])
cleaned_taxi_df$Trip.Day.Of.The.Week <- as.factor(cleaned_taxi_df$Trip.Day.Of.The.Week)
head(cleaned_taxi_df)

1.5 Convert datatype of payment type

cleaned_taxi_df$Payment.Type <- as.factor(cleaned_taxi_df$Payment.Type)
cleaned_taxi_df$Company <- as.factor(cleaned_taxi_df$Company)
cleaned_taxi_df$Taxi.ID <- as.factor(cleaned_taxi_df$Taxi.ID)
head(cleaned_taxi_df)

1.6 Convert datatype of community area to string

cleaned_taxi_df$Pickup.Community.Area <- as.factor(cleaned_taxi_df$Pickup.Community.Area)
cleaned_taxi_df$Dropoff.Community.Area <- as.factor(cleaned_taxi_df$Dropoff.Community.Area)

1.7 Check dimension and summary of the cleaned dataset

dim(cleaned_taxi_df)
## [1] 379024     17
names(cleaned_taxi_df)
##  [1] "Trip.ID"                "Taxi.ID"                "Trip.Start.Timestamp"  
##  [4] "Trip.Miles"             "Pickup.Community.Area"  "Dropoff.Community.Area"
##  [7] "Fare"                   "Tips"                   "Tolls"                 
## [10] "Extras"                 "Trip.Total"             "Payment.Type"          
## [13] "Company"                "Trip.Minutes"           "Trip.Start.Date"       
## [16] "Trip.Hour.Of.The.Day"   "Trip.Day.Of.The.Week"
summary(cleaned_taxi_df)
##    Trip.ID         
##  Length:379024     
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
##                    
##                                                                                                                              Taxi.ID      
##  d40dae7ea46d61abca67eb53b157fe9cf0b485cca6dce122604588a69aa6c4b6b78e0e5c5fd11f9702babd94016122df1d328a459c8b7de2cb37a1bad947b1fe:   828  
##  2780ead18beaa862cc67315ddabd9d1acaadcd6da82eba38b064d7d6f4acc260b68ef1ae3ce06dad845178107940b3493fa99640f0f70c25d15cf57336ab7b8f:   739  
##  abd1ffa32433ceabeb49f4461015b38ddc252847ed3a29320aee6af650ba1e927195d191bf191f4f6f329ad7512a3f0f8e43ea844f3ead6f7c50fc4f0ccff08a:   637  
##  37073e8c9e454886fe4a916f80a9a3478570e7dd3e663f40c5b81eae90f8f611027c67455f43b426f4c34dcb7fdb6697c82a3c6d00237f11a4a6cf5b1d1ce0c7:   635  
##  13016372e777da1289d557edbe4ce2be8a68e77bc64768acaf5e0539b10be2ca089238dc27408b49b17899014e6e178e17c3ba455812fd84024f93e266324439:   633  
##  8da9e1d18757022c6a6a614fc2d38483e38aae441feff500095a83ebc68006cf88329f2c28e35ba92ead14037739f9971a8a2852f946ebc59d0160c4f1104ec8:   630  
##  (Other)                                                                                                                         :374922  
##  Trip.Start.Timestamp               Trip.Miles      Pickup.Community.Area
##  Min.   :2024-01-01 00:00:00.00   Min.   :  0.000   8      :82654        
##  1st Qu.:2024-01-09 20:45:00.00   1st Qu.:  0.880   76     :72260        
##  Median :2024-01-18 10:00:00.00   Median :  2.570   32     :63444        
##  Mean   :2024-01-17 10:42:18.18   Mean   :  6.085   28     :42272        
##  3rd Qu.:2024-01-24 16:30:00.00   3rd Qu.: 11.270   6      :12880        
##  Max.   :2024-02-01 00:00:00.00   Max.   :664.900   56     :12047        
##                                                     (Other):93467        
##  Dropoff.Community.Area      Fare              Tips             Tolls         
##  8      : 92741         Min.   :   0.00   Min.   :  0.000   Min.   :   0.000  
##  32     : 64694         1st Qu.:   7.75   1st Qu.:  0.000   1st Qu.:   0.000  
##  28     : 40248         Median :  14.00   Median :  0.040   Median :   0.000  
##  6      : 19276         Mean   :  20.72   Mean   :  2.592   Mean   :   0.054  
##  76     : 18269         3rd Qu.:  32.25   3rd Qu.:  3.700   3rd Qu.:   0.000  
##  7      : 16568         Max.   :1525.00   Max.   :200.000   Max.   :4444.440  
##  (Other):127228                                                               
##      Extras           Trip.Total           Payment.Type   
##  Min.   :   0.000   Min.   :   0.00   Cash       :108481  
##  1st Qu.:   0.000   1st Qu.:   9.75   Credit Card:136043  
##  Median :   0.000   Median :  16.50   Dispute    :   102  
##  Mean   :   1.327   Mean   :  24.86   Mobile     : 62672  
##  3rd Qu.:   1.000   3rd Qu.:  36.30   No Charge  :   244  
##  Max.   :5051.100   Max.   :8912.13   Prcard     : 50556  
##                                       Unknown    : 20926  
##                          Company       Trip.Minutes     Trip.Start.Date     
##  Flash Cab                   :86779   Min.   :   0.00   Min.   :2024-01-01  
##  Taxi Affiliation Services   :71541   1st Qu.:   7.28   1st Qu.:2024-01-10  
##  Sun Taxi                    :40352   Median :  14.23   Median :2024-01-18  
##  Taxicab Insurance Agency Llc:39522   Mean   :  18.48   Mean   :2024-01-17  
##  City Service                :35399   3rd Qu.:  25.72   3rd Qu.:2024-01-24  
##  Chicago Independents        :21779   Max.   :1435.58   Max.   :2024-02-01  
##  (Other)                     :83652                                         
##  Trip.Hour.Of.The.Day Trip.Day.Of.The.Week
##  17     : 28075       1:34446             
##  16     : 27115       2:59096             
##  15     : 26222       3:67076             
##  18     : 25558       4:72588             
##  14     : 25184       5:59096             
##  13     : 24860       6:50350             
##  (Other):222010       7:36372
head(cleaned_taxi_df)
saveRDS(cleaned_taxi_df, "taxi_df_data_cleaning.rds")
summary(cleaned_taxi_df)
##    Trip.ID         
##  Length:379024     
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
##                    
##                                                                                                                              Taxi.ID      
##  d40dae7ea46d61abca67eb53b157fe9cf0b485cca6dce122604588a69aa6c4b6b78e0e5c5fd11f9702babd94016122df1d328a459c8b7de2cb37a1bad947b1fe:   828  
##  2780ead18beaa862cc67315ddabd9d1acaadcd6da82eba38b064d7d6f4acc260b68ef1ae3ce06dad845178107940b3493fa99640f0f70c25d15cf57336ab7b8f:   739  
##  abd1ffa32433ceabeb49f4461015b38ddc252847ed3a29320aee6af650ba1e927195d191bf191f4f6f329ad7512a3f0f8e43ea844f3ead6f7c50fc4f0ccff08a:   637  
##  37073e8c9e454886fe4a916f80a9a3478570e7dd3e663f40c5b81eae90f8f611027c67455f43b426f4c34dcb7fdb6697c82a3c6d00237f11a4a6cf5b1d1ce0c7:   635  
##  13016372e777da1289d557edbe4ce2be8a68e77bc64768acaf5e0539b10be2ca089238dc27408b49b17899014e6e178e17c3ba455812fd84024f93e266324439:   633  
##  8da9e1d18757022c6a6a614fc2d38483e38aae441feff500095a83ebc68006cf88329f2c28e35ba92ead14037739f9971a8a2852f946ebc59d0160c4f1104ec8:   630  
##  (Other)                                                                                                                         :374922  
##  Trip.Start.Timestamp               Trip.Miles      Pickup.Community.Area
##  Min.   :2024-01-01 00:00:00.00   Min.   :  0.000   8      :82654        
##  1st Qu.:2024-01-09 20:45:00.00   1st Qu.:  0.880   76     :72260        
##  Median :2024-01-18 10:00:00.00   Median :  2.570   32     :63444        
##  Mean   :2024-01-17 10:42:18.18   Mean   :  6.085   28     :42272        
##  3rd Qu.:2024-01-24 16:30:00.00   3rd Qu.: 11.270   6      :12880        
##  Max.   :2024-02-01 00:00:00.00   Max.   :664.900   56     :12047        
##                                                     (Other):93467        
##  Dropoff.Community.Area      Fare              Tips             Tolls         
##  8      : 92741         Min.   :   0.00   Min.   :  0.000   Min.   :   0.000  
##  32     : 64694         1st Qu.:   7.75   1st Qu.:  0.000   1st Qu.:   0.000  
##  28     : 40248         Median :  14.00   Median :  0.040   Median :   0.000  
##  6      : 19276         Mean   :  20.72   Mean   :  2.592   Mean   :   0.054  
##  76     : 18269         3rd Qu.:  32.25   3rd Qu.:  3.700   3rd Qu.:   0.000  
##  7      : 16568         Max.   :1525.00   Max.   :200.000   Max.   :4444.440  
##  (Other):127228                                                               
##      Extras           Trip.Total           Payment.Type   
##  Min.   :   0.000   Min.   :   0.00   Cash       :108481  
##  1st Qu.:   0.000   1st Qu.:   9.75   Credit Card:136043  
##  Median :   0.000   Median :  16.50   Dispute    :   102  
##  Mean   :   1.327   Mean   :  24.86   Mobile     : 62672  
##  3rd Qu.:   1.000   3rd Qu.:  36.30   No Charge  :   244  
##  Max.   :5051.100   Max.   :8912.13   Prcard     : 50556  
##                                       Unknown    : 20926  
##                          Company       Trip.Minutes     Trip.Start.Date     
##  Flash Cab                   :86779   Min.   :   0.00   Min.   :2024-01-01  
##  Taxi Affiliation Services   :71541   1st Qu.:   7.28   1st Qu.:2024-01-10  
##  Sun Taxi                    :40352   Median :  14.23   Median :2024-01-18  
##  Taxicab Insurance Agency Llc:39522   Mean   :  18.48   Mean   :2024-01-17  
##  City Service                :35399   3rd Qu.:  25.72   3rd Qu.:2024-01-24  
##  Chicago Independents        :21779   Max.   :1435.58   Max.   :2024-02-01  
##  (Other)                     :83652                                         
##  Trip.Hour.Of.The.Day Trip.Day.Of.The.Week
##  17     : 28075       1:34446             
##  16     : 27115       2:59096             
##  15     : 26222       3:67076             
##  18     : 25558       4:72588             
##  14     : 25184       5:59096             
##  13     : 24860       6:50350             
##  (Other):222010       7:36372

2. Exploratory Data Analysis

2.1 Check and remove rows that has outliers

cleaned_taxi_df <- readRDS("taxi_df_data_cleaning.rds")
attach(cleaned_taxi_df)
summary(cleaned_taxi_df)
##    Trip.ID         
##  Length:379024     
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
##                    
##                                                                                                                              Taxi.ID      
##  d40dae7ea46d61abca67eb53b157fe9cf0b485cca6dce122604588a69aa6c4b6b78e0e5c5fd11f9702babd94016122df1d328a459c8b7de2cb37a1bad947b1fe:   828  
##  2780ead18beaa862cc67315ddabd9d1acaadcd6da82eba38b064d7d6f4acc260b68ef1ae3ce06dad845178107940b3493fa99640f0f70c25d15cf57336ab7b8f:   739  
##  abd1ffa32433ceabeb49f4461015b38ddc252847ed3a29320aee6af650ba1e927195d191bf191f4f6f329ad7512a3f0f8e43ea844f3ead6f7c50fc4f0ccff08a:   637  
##  37073e8c9e454886fe4a916f80a9a3478570e7dd3e663f40c5b81eae90f8f611027c67455f43b426f4c34dcb7fdb6697c82a3c6d00237f11a4a6cf5b1d1ce0c7:   635  
##  13016372e777da1289d557edbe4ce2be8a68e77bc64768acaf5e0539b10be2ca089238dc27408b49b17899014e6e178e17c3ba455812fd84024f93e266324439:   633  
##  8da9e1d18757022c6a6a614fc2d38483e38aae441feff500095a83ebc68006cf88329f2c28e35ba92ead14037739f9971a8a2852f946ebc59d0160c4f1104ec8:   630  
##  (Other)                                                                                                                         :374922  
##  Trip.Start.Timestamp               Trip.Miles      Pickup.Community.Area
##  Min.   :2024-01-01 00:00:00.00   Min.   :  0.000   8      :82654        
##  1st Qu.:2024-01-09 20:45:00.00   1st Qu.:  0.880   76     :72260        
##  Median :2024-01-18 10:00:00.00   Median :  2.570   32     :63444        
##  Mean   :2024-01-17 10:42:18.18   Mean   :  6.085   28     :42272        
##  3rd Qu.:2024-01-24 16:30:00.00   3rd Qu.: 11.270   6      :12880        
##  Max.   :2024-02-01 00:00:00.00   Max.   :664.900   56     :12047        
##                                                     (Other):93467        
##  Dropoff.Community.Area      Fare              Tips             Tolls         
##  8      : 92741         Min.   :   0.00   Min.   :  0.000   Min.   :   0.000  
##  32     : 64694         1st Qu.:   7.75   1st Qu.:  0.000   1st Qu.:   0.000  
##  28     : 40248         Median :  14.00   Median :  0.040   Median :   0.000  
##  6      : 19276         Mean   :  20.72   Mean   :  2.592   Mean   :   0.054  
##  76     : 18269         3rd Qu.:  32.25   3rd Qu.:  3.700   3rd Qu.:   0.000  
##  7      : 16568         Max.   :1525.00   Max.   :200.000   Max.   :4444.440  
##  (Other):127228                                                               
##      Extras           Trip.Total           Payment.Type   
##  Min.   :   0.000   Min.   :   0.00   Cash       :108481  
##  1st Qu.:   0.000   1st Qu.:   9.75   Credit Card:136043  
##  Median :   0.000   Median :  16.50   Dispute    :   102  
##  Mean   :   1.327   Mean   :  24.86   Mobile     : 62672  
##  3rd Qu.:   1.000   3rd Qu.:  36.30   No Charge  :   244  
##  Max.   :5051.100   Max.   :8912.13   Prcard     : 50556  
##                                       Unknown    : 20926  
##                          Company       Trip.Minutes     Trip.Start.Date     
##  Flash Cab                   :86779   Min.   :   0.00   Min.   :2024-01-01  
##  Taxi Affiliation Services   :71541   1st Qu.:   7.28   1st Qu.:2024-01-10  
##  Sun Taxi                    :40352   Median :  14.23   Median :2024-01-18  
##  Taxicab Insurance Agency Llc:39522   Mean   :  18.48   Mean   :2024-01-17  
##  City Service                :35399   3rd Qu.:  25.72   3rd Qu.:2024-01-24  
##  Chicago Independents        :21779   Max.   :1435.58   Max.   :2024-02-01  
##  (Other)                     :83652                                         
##  Trip.Hour.Of.The.Day Trip.Day.Of.The.Week
##  17     : 28075       1:34446             
##  16     : 27115       2:59096             
##  15     : 26222       3:67076             
##  18     : 25558       4:72588             
##  14     : 25184       5:59096             
##  13     : 24860       6:50350             
##  (Other):222010       7:36372
dim(cleaned_taxi_df)
## [1] 379024     17
hist(Fare, breaks = 50, main = "Histogram of Total fare", xlab = "Total Fare")

summary(Fare)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    7.75   14.00   20.72   32.25 1525.00

Looking at the histogram and the boxplot, there are some extreme values that hinders accuracy of our future model, hence needs to be removed.

fare_related_features <- cleaned_taxi_df[, c('Fare','Tips','Tolls','Extras','Trip.Total')]
boxplot(fare_related_features)

# Define the function to detect outliers for a single column
is.outlier <- function(x) {
  iqr <- IQR(x, na.rm = TRUE)
  lower <- quantile(x, 0.25, na.rm = TRUE) - 1.5 * iqr
  upper <- quantile(x, 0.75, na.rm = TRUE) + 1.5 * iqr
  return(x < lower | x > upper)
}

outliers <- is.outlier(Trip.Total)

number of outliers

# Define the function to detect outliers for a single column
is.outlier <- function(x) {
  iqr <- IQR(x, na.rm = TRUE)
  lower <- quantile(x, 0.25, na.rm = TRUE) - 1.5 * iqr
  upper <- quantile(x, 0.75, na.rm = TRUE) + 1.5 * iqr
  return(x < lower | x > upper)
}

# List of numerical columns to check for outliers
numerical_cols <- c("Fare", "Tips", "Tolls", "Extras", "Trip.Total", "Trip.Minutes", "Trip.Miles")

# Initialize a logical vector to store the rows to keep (no outliers)
rows_to_keep <- rep(TRUE, nrow(cleaned_taxi_df))

# Initialize a vector to store the count of outliers for each feature
outlier_counts <- numeric(length(numerical_cols))
names(outlier_counts) <- numerical_cols

# Loop through each numerical column
for (col in numerical_cols) {
  # Find the outliers in the column
  outliers <- is.outlier(cleaned_taxi_df[[col]])
  # Store the count of outliers for the feature
  outlier_counts[col] <- sum(outliers)
  rows_to_keep <- rows_to_keep & !outliers
}

cleaned_taxi_df <- cleaned_taxi_df[rows_to_keep, ]

# Print the number of outliers for each feature
print(outlier_counts)
##         Fare         Tips        Tolls       Extras   Trip.Total Trip.Minutes 
##         2439        34169          534        82339         3889         9271 
##   Trip.Miles 
##         1603
dim(cleaned_taxi_df)
## [1] 285958     17
distance_features <- cleaned_taxi_df[, c('Trip.Miles')]
boxplot(distance_features)

2.2 Histrogram of each features

hist(cleaned_taxi_df$Fare, breaks = 25, main = "Histogram of Fare", xlab = "Fare")

summary(cleaned_taxi_df$Fare)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    6.75   10.00   14.83   21.50   69.00
hist(cleaned_taxi_df$Tips, breaks = 50, main = "Histogram of Tips", xlab = "Tips")

summary(cleaned_taxi_df$Tips)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   1.235   2.220   9.250
hist(cleaned_taxi_df$Tolls, breaks = 50, main = "Histogram of Tolls", xlab = "Tolls")

summary(cleaned_taxi_df$Tolls)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       0       0       0
hist(cleaned_taxi_df$Extras, breaks = 50, main = "Histogram of Extras", xlab = "Extras")

summary(cleaned_taxi_df$Extras)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1357  0.0000  2.5000
hist(cleaned_taxi_df$Trip.Total, breaks = 25, main = "Histogram of Trip.Total", xlab = "Trip.Total")

summary(cleaned_taxi_df$Trip.Total)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    8.40   12.00   16.33   23.25   75.50
hist(cleaned_taxi_df$Trip.Miles, breaks = 25, main = "Histogram of Trip.Miles", xlab = "Trip.Miles")

summary(cleaned_taxi_df$Trip.Miles)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.720   1.680   3.783   5.740  26.770
boxplot(cleaned_taxi_df[, c('Fare','Tips','Tolls','Extras','Trip.Total')])

boxplot(cleaned_taxi_df[, c('Trip.Miles')])

2.3 Heatmap of correlation of each features

# Select only numeric columns for correlation
numeric_columns <- sapply(cleaned_taxi_df, is.numeric)
cor_matrix <- cor(cleaned_taxi_df[, numeric_columns], use = "complete.obs")
## Warning in cor(cleaned_taxi_df[, numeric_columns], use = "complete.obs"): the
## standard deviation is zero
print(cor_matrix)
##               Trip.Miles        Fare       Tips Tolls      Extras  Trip.Total
## Trip.Miles    1.00000000  0.83320594 0.02935890    NA -0.08140904  0.81726238
## Fare          0.83320594  1.00000000 0.06029796    NA -0.10636693  0.98411311
## Tips          0.02935890  0.06029796 1.00000000    NA  0.05050705  0.23003575
## Tolls                 NA          NA         NA     1          NA          NA
## Extras       -0.08140904 -0.10636693 0.05050705    NA  1.00000000 -0.05773196
## Trip.Total    0.81726238  0.98411311 0.23003575    NA -0.05773196  1.00000000
## Trip.Minutes  0.70885089  0.75850279 0.01635369    NA -0.06630001  0.74146775
##              Trip.Minutes
## Trip.Miles     0.70885089
## Fare           0.75850279
## Tips           0.01635369
## Tolls                  NA
## Extras        -0.06630001
## Trip.Total     0.74146775
## Trip.Minutes   1.00000000
cor_data <- as.data.frame(as.table(cor_matrix))

ggplot(cor_data, aes(Var1, Var2, fill = Freq)) +
  geom_tile() +  # This creates the heatmap boxes
  geom_text(aes(label = sprintf("%.2f", Freq)), vjust = 1, color = "black", size = 3) +  # Adjust size as needed
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, limit = c(-1, 1)) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggtitle("Correlation Matrix Heatmap")

Trip.Total is in high correlation relationship with ‘Fare, Trip.Miles, Tips’, but almost no relationship with ‘Tolls, Hour of the trip’

2.4 Other data exploration

1) Average Number of Trips per Taxi in a Day Over Time

# Generate a sequence of dates within the range of your data
date_range <- seq(min(cleaned_taxi_df$Trip.Start.Date), max(cleaned_taxi_df$Trip.Start.Date), by = "day")

# Create a data frame of weekend dates
weekend_dates <- data.frame(Date = date_range[weekdays(date_range) %in% c("Saturday", "Sunday")])

# Group by Taxi ID and Date, then summarize the average trips
average_trips_per_taxi <- cleaned_taxi_df %>%
  group_by(Taxi.ID, Trip.Start.Date) %>%
  summarise(Trips = n(), .groups = 'drop') %>%
  group_by(Trip.Start.Date) %>%
  summarise(AvgTrips = mean(Trips), .groups = 'drop')

# Plot the average trips per taxi over time
plot <- ggplot(average_trips_per_taxi, aes(x = Trip.Start.Date, y = AvgTrips)) +
  geom_line() +
  labs(title = "Average Number of Trips per Taxi in a Day Over Time",
       x = "Date",
       y = " ") +
  theme_minimal()

# Add vertical lines for weekends
plot + geom_vline(data = weekend_dates, aes(xintercept = as.numeric(Date)), color = "red", linetype = "dashed")

2) Total Trips Over Time

label_thousands <- function(x) {
  paste0(x / 1000, "k")
}

library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
## The following objects are masked from 'package:formattable':
## 
##     comma, percent, scientific
# First, calculate the total trips per day
total_trips_over_time <- cleaned_taxi_df %>%
  group_by(Trip.Start.Date) %>%
  summarise(TotalTrips = n(), .groups = 'drop')

total_trips_plot <- ggplot(total_trips_over_time, aes(x = Trip.Start.Date, y = TotalTrips)) +
  geom_line() +
  scale_y_continuous(labels = label_thousands) +
  labs(title = "Total Trips Over Time",
       x = "Date",
       y = " ") +
  theme_minimal()

# Assuming weekend_dates is a vector that contains the dates of weekends
total_trips_plot <- total_trips_plot +
  geom_vline(data = data.frame(Date = weekend_dates), aes(xintercept = as.numeric(Date)),
             color = "red", linetype = "dashed")

print(total_trips_plot)

We can see that people tends not to use taxi on weekends than weekdays.

3) Pickup and Dropoff Area Count - Overlaid Histogram

# Create a long format data frame for pickup and dropoff areas
area_data <- tidyr::pivot_longer(
  cleaned_taxi_df,
  cols = c("Pickup.Community.Area", "Dropoff.Community.Area"),
  names_to = "AreaType",
  values_to = "Area"
)

# Plot overlaid bar charts for pickup and dropoff area counts
ggplot(area_data, aes(x = as.factor(Area), fill = AreaType)) +
  geom_bar(position = "identity", alpha = 0.5) + # Set alpha for transparency
  scale_y_continuous(labels = label_thousands) + # Format y-axis labels
  labs(title = "Overlaid Bar Chart of Pickup and Dropoff Area Counts",
       x = "Area Code",
       y = " ") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_fill_manual(values = c("Pickup.Community.Area" = "blue", "Dropoff.Community.Area" = "red"))

4) Trip.Total by Payment Type

# Define a custom label function for y axis to display in millions
label_millions <- function(x) {
  paste0(formatC(x / 1e6, format = "f", digits = 1), "M")
}

# Summarise Trip.Total by Payment Type
trip_total_by_PaymentType <- cleaned_taxi_df %>%
  group_by(Payment.Type) %>%
  summarise(TotalRevenue = sum(Trip.Total), .groups = 'drop')

# Create a bar plot of Trip Total by Payment Type
ggplot(trip_total_by_PaymentType, aes(x = Payment.Type, y = TotalRevenue, fill = Payment.Type)) +
  geom_col() +  # This creates a bar chart with pre-summarized data
  scale_y_continuous(labels = label_millions) + # Use the custom label function for millions
  labs(title = "Taxi Fare by Payment Type",
       x = " ",
       y = " ") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")

5) Trip.Total by Day of the week

trip_total_by_DayoftheWeek <- cleaned_taxi_df %>%
  group_by(Trip.Day.Of.The.Week) %>%
  summarise(TotalRevenue = sum(Trip.Total), .groups = 'drop')

# Define a named vector to map day numbers to day names
day_names <- c("1" = "Sunday", "2" = "Monday", "3" = "Tuesday", "4" = "Wednesday", 
               "5" = "Thursday", "6" = "Friday", "7" = "Saturday")

# Create the bar plot, using the named vector for axis labels
ggplot(trip_total_by_DayoftheWeek, aes(x = Trip.Day.Of.The.Week, y = TotalRevenue, fill = Trip.Day.Of.The.Week)) +
  geom_col() +
  scale_x_discrete(labels = day_names) +  # Use the day_names vector for axis labels
  scale_y_continuous(labels = label_millions) +
  labs(title = "Taxi fare by Day of the Week",
       x = " ",
       y = " ") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")

6) Trip.Total by Company

trip_total_by_Company <- cleaned_taxi_df %>%
  group_by(Company) %>%
  summarise(TotalRevenue = sum(Trip.Total) / 1e6, .groups = 'drop') %>%
  arrange(desc(TotalRevenue))

kable(trip_total_by_Company, digits = 2, col.names = c("Company", "Total Revenue (Millions $)"))
Company Total Revenue (Millions $)
Flash Cab 1.51
Taxi Affiliation Services 0.86
Sun Taxi 0.42
City Service 0.42
Taxicab Insurance Agency Llc 0.37
Chicago Independents 0.20
5 Star Taxi 0.19
Globe Taxi 0.16
Blue Ribbon Taxi Association 0.15
Medallion Leasin 0.12
Choice Taxi Association 0.05
Taxicab Insurance Agency, LLC 0.04
Choice Taxi Association Inc 0.04
Chicago City Taxi Association 0.03
U Taxicab 0.02
Top Cab 0.01
Koam Taxi Association 0.01
Taxi Affiliation Services Llc - Yell 0.01
Patriot Taxi Dba Peace Taxi Associat 0.01
Star North Taxi Management Llc 0.01
Chicago Taxicab 0.01
Metro Jet Taxi A. 0.00
3591 - 63480 Chuks Cab 0.00
Tac - Yellow Cab Association 0.00
Setare Inc 0.00
3556 - 36214 RC Andrews Cab 0.00
5167 - 71969 5167 Taxi Inc 0.00
Tac - Checker Cab Dispatch 0.00
312 Medallion Management Corp 0.00
6574 - Babylon Express Inc. 0.00
Petani Cab Corp 0.00
2733 - 74600 Benny Jona 0.00
4053 - 40193 Adwar H. Nikola 0.00

7) Trip.Total by time in a day

# Summarise Trip.Total by Company and arrange by TotalRevenue in descending order
trip_total_by_time <- cleaned_taxi_df %>%
  group_by(Trip.Hour.Of.The.Day) %>%
  summarise(TotalRevenue = sum(Trip.Total), .groups = 'drop') %>%
  arrange(desc(TotalRevenue))

ggplot(trip_total_by_time, aes(x = Trip.Hour.Of.The.Day, y = TotalRevenue, fill = TotalRevenue)) +
  geom_col() +
  scale_y_continuous(labels = label_thousands) +
  labs(title = "Taxi fare by time in a day",
       x = "Time",
       y = " ") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "none") +
  scale_fill_gradient(low = "lightblue", high = "blue") 

7) Trips Count by Paymeny type

# Summarise Trip id by Payment Type
trip_count_by_PaymentType <- cleaned_taxi_df %>%
  group_by(Payment.Type) %>%
  summarise(TripCount = n_distinct(Trip.ID))

# Create a bar plot of Trip Total by Payment Type
ggplot(trip_count_by_PaymentType, aes(x = Payment.Type, y = TripCount, fill = Payment.Type)) +
  geom_col() +
  scale_y_continuous(labels = label_thousands) +
  labs(title = "Trips Count by Paymeny type",
       x = " ",
       y = " ") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none") 

8) Trips Count by day of the week

trip_count_by_DayoftheWeek <- cleaned_taxi_df %>%
  group_by(Trip.Day.Of.The.Week) %>%
  summarise(TripCount = n_distinct(Trip.ID))

# Define a named vector to map day numbers to day names
day_names <- c("1" = "Sunday", "2" = "Monday", "3" = "Tuesday", "4" = "Wednesday", 
               "5" = "Thursday", "6" = "Friday", "7" = "Saturday")

# Create the bar plot, using the named vector for axis labels
ggplot(trip_count_by_DayoftheWeek, aes(x = Trip.Day.Of.The.Week, y = TripCount, fill = Trip.Day.Of.The.Week)) +
  geom_col() +
  scale_x_discrete(labels = day_names) + 
  scale_y_continuous(labels = label_thousands) +
  labs(title = "Trips Count by Day of the Week",
       x = " ",
       y = " ") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")

9) Trips Count by Company

# Summarise Trip.Total by Company and arrange by TotalRevenue in descending order
trip_count_by_Company <- cleaned_taxi_df %>%
  group_by(Company) %>%
  summarise(TripCount = n_distinct(Trip.ID) / 1e3 , .groups = 'drop') %>%
  arrange(desc(TripCount))

# Display the full table sorted by Total Revenue
kable(trip_count_by_Company, digits=2, col.names = c("Company", "Trips Count (Thousands $)"))
Company Trips Count (Thousands $)
Flash Cab 75.98
Taxi Affiliation Services 53.39
Sun Taxi 28.14
City Service 27.04
Taxicab Insurance Agency Llc 26.32
Chicago Independents 14.53
5 Star Taxi 11.10
Globe Taxi 10.76
Blue Ribbon Taxi Association 10.47
Medallion Leasin 8.28
Taxicab Insurance Agency, LLC 3.88
Choice Taxi Association 3.08
Chicago City Taxi Association 3.06
Choice Taxi Association Inc 2.38
U Taxicab 1.64
Top Cab 1.26
Koam Taxi Association 1.03
Taxi Affiliation Services Llc - Yell 0.73
Patriot Taxi Dba Peace Taxi Associat 0.69
Star North Taxi Management Llc 0.44
Chicago Taxicab 0.41
3591 - 63480 Chuks Cab 0.32
Setare Inc 0.25
Metro Jet Taxi A. 0.24
Tac - Yellow Cab Association 0.18
3556 - 36214 RC Andrews Cab 0.09
5167 - 71969 5167 Taxi Inc 0.09
312 Medallion Management Corp 0.07
Tac - Checker Cab Dispatch 0.06
6574 - Babylon Express Inc. 0.03
Petani Cab Corp 0.00
2733 - 74600 Benny Jona 0.00
4053 - 40193 Adwar H. Nikola 0.00

10) Trips Count by time in a day

# Summarise Trip.Total by Company and arrange by TotalRevenue in descending order
trip_count_by_time <- cleaned_taxi_df %>%
  group_by(Trip.Hour.Of.The.Day) %>%
  summarise(TripCount = n_distinct(Trip.ID), .groups = 'drop') %>%
  arrange(desc(TripCount))

ggplot(trip_count_by_time, aes(x = Trip.Hour.Of.The.Day, y = TripCount, fill = TripCount)) +
  geom_col() +
  scale_y_continuous(labels = label_thousands) +
  labs(title = "Trips Count by time in a day",
       x = "Time",
       y = " ") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "none") + 
  scale_fill_gradient(low = "lightblue", high = "blue")

detach(cleaned_taxi_df)
saveRDS(cleaned_taxi_df, "taxi_df_EDA.rds")

Some Key Findings from EDA

  1. Total taxi fare correlates with original fare, trip miles, tips, where does not correlates with tolls and trip hours of the day.

  2. People tends to use taxi on weekdays (especially on Wed > Tue > Mon), but not on weekends.

  3. Majority of people use Credit Card then Cash to pay the taxi fare.

  4. The most popular Pickup area are ‘8, 76, 32, 28’.

  5. The most popular Dropoff area are ‘8, 32, 28’.

  6. Top 5 most frequently used taxi companies in Chicago are Flash Cab, Taxi Affiliation Services, Sun Taxi, Taxicab Insurance Agency Llc, and City Service.

  7. The peak time for using taxi is 4pm~5pm. But by the taxi fare, the peak time is 3pm~7pm.

3. Model Selection

3.1 Feature Selection using Random Forests

cleaned_taxi_df <- readRDS("taxi_df_EDA.rds")
features_to_drop <- c('Trip.ID', 'Taxi.ID', 'Trip.Start.Timestamp', 'Trip.Start.Date', 'Payment.Type', 'Company', 'Tips', 'Tolls', 'Extras', 'Trip.Total')
selected_taxi_df <- cleaned_taxi_df %>% select(-one_of(features_to_drop))

factor_columns <- sapply(selected_taxi_df, is.factor)
selected_taxi_df[factor_columns] <- sapply(selected_taxi_df[factor_columns], function(x) as.numeric(levels(x))[x])

head(selected_taxi_df)
set.seed(123)

sample_size <- floor(0.5 * nrow(selected_taxi_df))
sample_indices <- sample(seq_len(nrow(selected_taxi_df)), size=sample_size)

random_forest <- randomForest(Fare ~ ., data = selected_taxi_df[sample_indices, ], importance = TRUE, ntree = 25)
importance(random_forest)
##                          %IncMSE IncNodePurity
## Trip.Miles             28.697780     7523341.9
## Pickup.Community.Area  10.293868     1318231.5
## Dropoff.Community.Area 11.591969     1421921.2
## Trip.Minutes           26.346702     5572350.4
## Trip.Hour.Of.The.Day   22.412510      440651.3
## Trip.Day.Of.The.Week    3.587485      210386.1

3.2 Feature Selection using Best Subset Selection

regfit.full <- regsubsets(Fare ~ ., data = selected_taxi_df)
reg.summary <- summary(regfit.full)
print(reg.summary)
## Subset selection object
## Call: regsubsets.formula(Fare ~ ., data = selected_taxi_df)
## 6 Variables  (and intercept)
##                        Forced in Forced out
## Trip.Miles                 FALSE      FALSE
## Pickup.Community.Area      FALSE      FALSE
## Dropoff.Community.Area     FALSE      FALSE
## Trip.Minutes               FALSE      FALSE
## Trip.Hour.Of.The.Day       FALSE      FALSE
## Trip.Day.Of.The.Week       FALSE      FALSE
## 1 subsets of each size up to 6
## Selection Algorithm: exhaustive
##          Trip.Miles Pickup.Community.Area Dropoff.Community.Area Trip.Minutes
## 1  ( 1 ) "*"        " "                   " "                    " "         
## 2  ( 1 ) "*"        " "                   " "                    "*"         
## 3  ( 1 ) "*"        " "                   "*"                    "*"         
## 4  ( 1 ) "*"        "*"                   "*"                    "*"         
## 5  ( 1 ) "*"        "*"                   "*"                    "*"         
## 6  ( 1 ) "*"        "*"                   "*"                    "*"         
##          Trip.Hour.Of.The.Day Trip.Day.Of.The.Week
## 1  ( 1 ) " "                  " "                 
## 2  ( 1 ) " "                  " "                 
## 3  ( 1 ) " "                  " "                 
## 4  ( 1 ) " "                  " "                 
## 5  ( 1 ) " "                  "*"                 
## 6  ( 1 ) "*"                  "*"
highest_point <- which.max(reg.summary$adjr2)

plot(reg.summary$adjr2, xlab = "Number of Variables",
    ylab = "Adjusted RSq", type = "l")
points(highest_point, reg.summary$adjr2[highest_point], col = "red", cex = 2, 
    pch = 20)

plot(reg.summary$cp, xlab = "Number of Variables", ylab = "Cp", type = "l")
lowest_point <- which.min(reg.summary$cp)
points(lowest_point, reg.summary$cp[lowest_point], col = "red", cex = 2, pch = 20)

As we see from the Adjusted R2 and Cp graphs, we would have the best performance on using all the 6 features. The individual importance of these features was calculated using the random forests where we found that Trip.Miles & Trip.Hour.Of.The.Day having the highest % Increase in MSE if not included while Trip.Miles & Trip.Minutes contributed the most to the increases in Node purities of the decision trees when used to split.

3.3 Fare prediction using Linear models with Lasso & Ridge

set.seed(123)

trainIndex <- createDataPartition(selected_taxi_df$Fare, p = 0.6, list = FALSE)
train_set <- selected_taxi_df[trainIndex,]
test_set <- selected_taxi_df[-trainIndex,]

cat("Training:", dim(train_set), "Testing:", dim(test_set))
## Training: 171577 7 Testing: 114381 7
x_train <- model.matrix(Fare ~ .-1, data = train_set)
y_train <- train_set$Fare

x_test <- model.matrix(Fare ~ .-1, data = test_set)
y_test <- test_set$Fare
grid <- 10^seq(10, -2, length = 100)
cv_ridge_fit <- cv.glmnet(x_train, y_train, alpha = 0, lambda = grid)
cat("Min value for lambda(Ridge):", cv_ridge_fit$lambda.min)
## Min value for lambda(Ridge): 0.01
plot(cv_ridge_fit)
title("Cross-Validation Lambda value for Ridge", line = 2.5)

grid <- 10^seq(10, -2, length = 100)
cv_lasso_fit <- cv.glmnet(x_train, y_train, alpha = 1, lambda = grid)
cat("Min value for lambda(lasso):", cv_lasso_fit$lambda.min)
## Min value for lambda(lasso): 0.01
plot(cv_lasso_fit)
title("Cross-Validation Lambda value for lasso", line = 2.5)

lasso_model <- glmnet(x_train, y_train, alpha = 1, lambda = cv_lasso_fit$lambda.min)
ridge_model <- glmnet(x_train, y_train, alpha = 0, lambda = cv_ridge_fit$lambda.min)

predicted_fare_lasso <- predict(lasso_model, newx = x_test)
predicted_fare_ridge <- predict(ridge_model, newx = x_test)
mse_lasso <- mean((y_test - predicted_fare_lasso)^2)
print(paste("Lasso Model - Mean Squared Error (MSE):", mse_lasso))
## [1] "Lasso Model - Mean Squared Error (MSE): 29.1224046335354"
mse_ridge <- mean((y_test - predicted_fare_ridge)^2)
print(paste("Ridge Model - Mean Squared Error (MSE):", mse_ridge))
## [1] "Ridge Model - Mean Squared Error (MSE): 29.1226404014018"
mae_lasso <- mean(abs(y_test - predicted_fare_lasso))
print(paste("Lasso Model - Mean Absolute Error (mae):", mae_lasso))
## [1] "Lasso Model - Mean Absolute Error (mae): 2.80794126799152"
mae_ridge <- mean(abs(y_test - predicted_fare_ridge))
print(paste("Ridge Model - Mean Absolute Error (mae):", mae_ridge))
## [1] "Ridge Model - Mean Absolute Error (mae): 2.80851565885102"
df_lasso <- data.frame(Actual = y_test, Prediction = c(predicted_fare_lasso))
ggplot(df_lasso, aes(x = Actual, y = Prediction)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, color = "red") +
  ggtitle("Lasso: Predicted vs Actual Values") +
  xlab("Actual Values") +
  ylab("Predicted Values")

df_ridge <- data.frame(Actual = y_test, Prediction = c(predicted_fare_ridge))
ggplot(df_ridge, aes(x = Actual, y = Prediction)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, color = "red") +
  ggtitle("Ridge: Predicted vs Actual Values") +
  xlab("Actual Values") +
  ylab("Predicted Values")

3.4 Fare prediction using Splines

smooth_spline <- lm(Fare ~ bs(Trip.Miles) + bs(Pickup.Community.Area) + bs(Dropoff.Community.Area) + bs(Trip.Minutes) + bs(Trip.Hour.Of.The.Day) + bs(Trip.Day.Of.The.Week), data = train_set)
summary(smooth_spline)
## 
## Call:
## lm(formula = Fare ~ bs(Trip.Miles) + bs(Pickup.Community.Area) + 
##     bs(Dropoff.Community.Area) + bs(Trip.Minutes) + bs(Trip.Hour.Of.The.Day) + 
##     bs(Trip.Day.Of.The.Week), data = train_set)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -53.287  -1.955  -0.946   0.800  62.650 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 10.59614    0.10628  99.702  < 2e-16 ***
## bs(Trip.Miles)1             -8.40632    0.17624 -47.699  < 2e-16 ***
## bs(Trip.Miles)2             42.10012    0.29904 140.784  < 2e-16 ***
## bs(Trip.Miles)3             25.07331    0.44035  56.939  < 2e-16 ***
## bs(Pickup.Community.Area)1  -4.86976    0.17587 -27.690  < 2e-16 ***
## bs(Pickup.Community.Area)2   4.35223    0.15593  27.912  < 2e-16 ***
## bs(Pickup.Community.Area)3   0.38478    0.08298   4.637 3.54e-06 ***
## bs(Dropoff.Community.Area)1 -3.22463    0.16531 -19.506  < 2e-16 ***
## bs(Dropoff.Community.Area)2  3.62377    0.14404  25.158  < 2e-16 ***
## bs(Dropoff.Community.Area)3  0.78060    0.07482  10.433  < 2e-16 ***
## bs(Trip.Minutes)1            7.00863    0.15853  44.209  < 2e-16 ***
## bs(Trip.Minutes)2           25.38679    0.17381 146.060  < 2e-16 ***
## bs(Trip.Minutes)3           13.28938    0.17015  78.102  < 2e-16 ***
## bs(Trip.Hour.Of.The.Day)1   -2.73170    0.17948 -15.220  < 2e-16 ***
## bs(Trip.Hour.Of.The.Day)2   -5.33725    0.10511 -50.778  < 2e-16 ***
## bs(Trip.Hour.Of.The.Day)3   -0.89259    0.10933  -8.164 3.25e-16 ***
## bs(Trip.Day.Of.The.Week)1   -1.54938    0.11248 -13.775  < 2e-16 ***
## bs(Trip.Day.Of.The.Week)2   -1.29344    0.07933 -16.304  < 2e-16 ***
## bs(Trip.Day.Of.The.Week)3   -0.80240    0.05797 -13.840  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.999 on 171558 degrees of freedom
## Multiple R-squared:  0.7937, Adjusted R-squared:  0.7937 
## F-statistic: 3.667e+04 on 18 and 171558 DF,  p-value: < 2.2e-16
predicted_fare <- predict(smooth_spline, newdata = test_set)

mse <- mean((test_set$Fare - predicted_fare)^2)
print(paste("Mean Squared Error (MSE):", mse))
## [1] "Mean Squared Error (MSE): 24.8368407582032"
mae <- mean(abs(test_set$Fare - predicted_fare))
print(paste("Mean Absolute Error (MAE):", mae))
## [1] "Mean Absolute Error (MAE): 2.59340833446686"
rmse <- sqrt(mse)
print(paste("Root Mean Squared Error (RMSE):", rmse))
## [1] "Root Mean Squared Error (RMSE): 4.98365736765713"
train_predicted_fare <- predict(smooth_spline, newdata = train_set)
residuals <- residuals(smooth_spline)

plot_data <- data.frame(Predicted = train_predicted_fare, Residuals = residuals)

ggplot(plot_data, aes(x = Predicted, y = Residuals)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  theme_minimal() +
  labs(title = "Residual Plot", x = "Predicted Fare", y = "Residuals")

df_splines <- data.frame(Actual = test_set$Fare, Predicted = predicted_fare)
ggplot(df_splines, aes(x = Actual, y = Predicted)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, color = "red") +
  ggtitle("Smoothing Splines: Predicted vs Actual Values") +
  xlab("Actual Values") +
  ylab("Predicted Values")

As we see from the test set results, smoothing splines with an error of (+,-)$3.5 perform better than the linear models of lasso & ridge with an error of (+,-)$3.9. But the residual analysis of the smoothing splines show that the residuals might be following a particular pattern as they all fall in a certain region.

3. Tipping analysis

features_to_drop <- c('Trip.ID', 'Taxi.ID', 'Trip.Start.Timestamp', 'Trip.Start.Date', 'Payment.Type', 'Company', 'Tolls', 'Extras', 'Trip.Total')
selected_taxi_df <- cleaned_taxi_df %>% select(-one_of(features_to_drop))

factor_columns <- sapply(selected_taxi_df, is.factor)
selected_taxi_df[factor_columns] <- sapply(selected_taxi_df[factor_columns], function(x) as.numeric(levels(x))[x])
summary(selected_taxi_df$Tips)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   1.235   2.220   9.250
model <- lm(Tips ~ ., data = selected_taxi_df)
summary(model)
## 
## Call:
## lm(formula = Tips ~ ., data = selected_taxi_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.2715 -1.3258 -0.9724  0.9940  8.7408 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             1.3111689  0.0139370  94.078  < 2e-16 ***
## Trip.Miles             -0.0211006  0.0013904 -15.176  < 2e-16 ***
## Pickup.Community.Area  -0.0150702  0.0001931 -78.042  < 2e-16 ***
## Dropoff.Community.Area  0.0029834  0.0001862  16.019  < 2e-16 ***
## Fare                    0.0287299  0.0006143  46.769  < 2e-16 ***
## Trip.Minutes           -0.0123958  0.0005148 -24.079  < 2e-16 ***
## Trip.Hour.Of.The.Day    0.0019003  0.0006887   2.759  0.00579 ** 
## Trip.Day.Of.The.Week    0.0038429  0.0019172   2.004  0.04503 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.774 on 285950 degrees of freedom
## Multiple R-squared:  0.02729,    Adjusted R-squared:  0.02727 
## F-statistic:  1146 on 7 and 285950 DF,  p-value: < 2.2e-16
# Plot for Trip.Miles vs Tips
ggplot(selected_taxi_df, aes(x = Trip.Miles, y = Tips)) +
  geom_point() +  
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +  # Linear regression line
  labs(title = "Trip Miles vs Tips", x = "Trip Miles", y = "Tips")

# Plot for Pickup.Community.Area vs Tips
ggplot(selected_taxi_df, aes(x = Pickup.Community.Area, y = Tips)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  labs(title = "Pickup Community Area vs Tips", x = "Pickup Community Area", y = "Tips")

# Plot for Dropoff.Community.Area vs Tips
ggplot(selected_taxi_df, aes(x = Dropoff.Community.Area, y = Tips)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  labs(title = "Dropoff Community Area vs Tips", x = "Dropoff Community Area", y = "Tips")

# Plot for Fare vs Tips
ggplot(selected_taxi_df, aes(x = Fare, y = Tips)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  labs(title = "Fare vs Tips", x = "Fare", y = "Tips")

# Plot for Trip.Minutes vs Tips
ggplot(selected_taxi_df, aes(x = Trip.Minutes, y = Tips)) +
  geom_point() +
  geom_smooth(method = "lm",formula = y ~ x, se = FALSE) +
  labs(title = "Trip Minutes vs Tips", x = "Trip Minutes", y = "Tips")

# Plot for Trip.Hour.Of.The.Day vs Tips
ggplot(selected_taxi_df, aes(x = Trip.Hour.Of.The.Day, y = Tips)) +
  geom_point() +
  geom_smooth(method = "lm",formula = y ~ x, se = FALSE) +
  labs(title = "Trip Hour Of The Day vs Tips", x = "Trip Hour Of The Day", y = "Tips")

# Plot for Trip.Day.Of.The.Week vs Tips
ggplot(selected_taxi_df, aes(x = Trip.Day.Of.The.Week, y = Tips)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  labs(title = "Trip Day Of The Week vs Tips", x = "Trip Day Of The Week", y = "Tips")